# ------------------------------------------------------------------------------------
# Notes:
# This version is compatible with Andrew Chen and Tom Zimmermann’s updated dataset released in November 2025.
# To extract short-leg and long-leg classifications for each Anomaly–PERMNO–YearMonth combination, run this "signalbase.py" script in python. This will generate the dataset "signalbase_2006_2021.csv".
# "SignalDoc.csv" and "signed_predictors_dl_wide.csv" are provided by Andrew Chen and Tom Zimmermann, available for download at 
import numpy as np
import pandas as pd
import os
os.chdir('/Volumes/T9/Replicate File-202506/') 

SignalDoc=pd.read_csv('data/SignalDoc.csv') 
SignalDoc.columns=[x.replace('.','_').replace(' ','_').replace('-','_') for x in SignalDoc.columns]  # Standardize the variable names
keep_Acronym=SignalDoc[SignalDoc['Predictability_in_OP'].isin(['1_clear','2_likely'])]['Acronym'].tolist()
print(f"Loaded {len(keep_Acronym)} predictable acronyms. First 5: {keep_Acronym[:5]}")

import wrds
db = wrds.Connection(wrds_username='your_wrds_username')  # Replace with your WRDS username
crsp_msf = db.raw_sql("""
                      select a.permno, a.date,a.ret, a.shrout,a.prc,b.shrcd, b.exchcd,c.dlstcd, c.dlret
                      from crsp.msf as a
                      left join crsp.msenames as b
                      on a.permno=b.permno
                      and b.namedt<=a.date
                      and a.date<=b.nameendt
                      left join crsp.msedelist as c
                      on a.permno=c.permno and date_trunc('month', a.date) = date_trunc('month', c.dlstdt)
                      where a.date between '2004-12-01'and '2024-12-31'
                      """ )

# ------Start: incorporate CRSP delisting returns
# This code block is adapted from https://github.com/OpenSourceAP/CrossSection/blob/master/Signals/pyCode/DataDownloads/CRSPMonthly.py
mask1 = (crsp_msf['dlret'].isna() &
         ((crsp_msf['dlstcd'] == 500) |
          ((crsp_msf['dlstcd'] >= 520) & (crsp_msf['dlstcd'] <= 584))) &
         ((crsp_msf['exchcd'] == 1) | (crsp_msf['exchcd'] == 2)))
crsp_msf.loc[mask1, 'dlret'] = -0.35

mask2 = (crsp_msf['dlret'].isna() &
         ((crsp_msf['dlstcd'] == 500) |
          ((crsp_msf['dlstcd'] >= 520) & (crsp_msf['dlstcd'] <= 584))) &
         (crsp_msf['exchcd'] == 3))
crsp_msf.loc[mask2, 'dlret'] = -0.55

crsp_msf.loc[(crsp_msf['dlret'] < -1) & crsp_msf['dlret'].notna(), 'dlret'] = -1
crsp_msf['dlret'] = crsp_msf['dlret'].fillna(0)

crsp_msf['ret'] = (1 + crsp_msf['ret']) * (1 + crsp_msf['dlret']) - 1
mask3 = crsp_msf['ret'].isna() & (crsp_msf['dlret'] != 0)
crsp_msf.loc[mask3, 'ret'] = crsp_msf.loc[mask3, 'dlret']
# ------End: incorporate CRSP delisting returns

crsp_msf[['permno','shrcd','exchcd']]=crsp_msf[['permno','shrcd','exchcd']].astype('int', errors='ignore')
crsp_msf['me']=crsp_msf['shrout']*np.abs(crsp_msf['prc'])
crsp_msf['yyyymm'] = [x[:4]+x[5:7] for x in crsp_msf['date']]
crsp_msf['yyyymm']=crsp_msf['yyyymm'].astype(int)
crsp_msf['Price']=crsp_msf['prc']*(-1)
crsp_msf['Size']=crsp_msf['prc']*crsp_msf['shrout']*(-1)
crsp_msf['STreversal']=crsp_msf['ret']*(-1)
print(len(crsp_msf))

df=pd.read_csv('data/signed_predictors_dl_wide.csv') 
df=df[df['yyyymm']>=200412]
df=pd.merge(df,crsp_msf[['permno','yyyymm','Price','Size','STreversal']],on=['permno','yyyymm'],how='left')
info=pd.read_csv('data/permno_shrcd_exchcd_monthly.csv',usecols=['permno','yyyymm','shrcd','exchcd']) # Create a monthly time series for each PERMNO based on CRSP.msenames
df=pd.merge(df,info,on=['permno','yyyymm'],how='left')
df = df[(df['shrcd'].isin([10, 11, 12]))&(df['exchcd'].isin([1, 2, 3]))].drop(['shrcd','exchcd'],axis=1)

# Melt from wide to long format
df2 = df.melt(id_vars=['permno', 'yyyymm'], var_name='Anomaly', value_name='value')
df2=df2[df2['Anomaly'].isin(keep_Acronym)]
df2['value'] = df2['value'].replace([np.inf, -np.inf], np.nan)

def get_decile_rank(series):
    if series.isnull().all() or len(series.dropna()) < 10: 
        return pd.Series(np.nan, index=series.index)
    return pd.qcut(series,q=10,labels=False,duplicates='drop')

df2['rank'] = df2.groupby(['Anomaly', 'yyyymm'])['value'].transform(get_decile_rank)
df2['short_leg'] = np.where(df2['rank']==0,1,0)
df2['long_leg'] = np.where(df2['rank']==9,1,0)
df2.loc[df2['rank'].isnull(), ['short_leg', 'long_leg']] = pd.NA
df2['jdate'] = pd.to_datetime(df2['yyyymm'].astype(str), format='%Y%m')+pd.DateOffset(months=1)
df2['yyyymm']=(df2['jdate'].dt.strftime('%Y%m')).astype(int)
df2=df2[df2['short_leg'].notnull()][['permno','yyyymm','Anomaly','short_leg','long_leg']]
df2[(df2['yyyymm']>=200601)&(df2['yyyymm']<=202112)].to_csv('data/signalbase_2006_2021.csv',index=False)
# df2[df2['yyyymm']>=202312].to_csv('data/signalbase_table4.csv',index=False)
print(len(df),len(df2))
print(df2[:3])




